# Snapshot the object names that exist before this script runs; the cleanup
# step at the bottom of the script removes everything that is not listed here.
keep <- ls()

##############################
#Clean 1874 Directory Data####
##############################

# Load the People's Guide data (OCR'd, then cleaned by MTurkers).
raw_1874 <- fread('./raw/Batch_4136847_batch_results.csv')

# Derive the identifiers embedded in each image URL:
#   county - the lowercase letters right after "mturk/", in title case
#   page   - the four digits right after "book_"
#   line   - the digits immediately before the ".png" extension
# The dot in ".png" is escaped so that it only matches a literal dot; an
# unescaped "." would accept any character in front of "png".
raw_1874[, `:=`(
  county = str_to_title(str_extract(Input.image_url, "(?<=mturk[/])[a-z]+")),
  page = as.numeric(str_extract(Input.image_url, "(?<=book[_])\\d{4}")),
  line = as.numeric(str_extract(Input.image_url, "\\d+(?=\\.png)"))
)]

# Clean up demographic information.
# Sort/key the raw table by its identifiers, then keep only rows that are not
# flagged as image_missing or no_bio and pull out the fields used downstream.
setkey(raw_1874, county, page, line)
temp <- raw_1874[
  !(Answer.image_attributes.image_missing) &
    !(Answer.image_attributes.no_bio),
  .(
    # NOTE(review): paste() renders a missing middle initial as the text "NA"
    # and leaves a trailing space when it is empty -- confirm that
    # clean_names() copes with both.
    first = paste(Answer.first_name, Answer.middle_init, sep = " "),
    last = Answer.last_name,
    birth_year = Answer.birth_year,
    birth_place = Answer.birth_place,
    settlement_year = Answer.settlement_year,
    party = Answer.party,
    # Reuse the county column already parsed from Input.image_url instead of
    # repeating the same regular expression here.
    census_county = county,
    flag = Answer.image_attributes.flag,
    image_url = Input.image_url
  )
]

# Clean names: add match_first, match_middle, match_last and match_first_clean
# by reference from the output of clean_names() (defined outside this section;
# presumably it returns those four vectors in this order -- verify).
temp[
  ,
  paste0("match_", c("first", "middle", "last", "first_clean")) :=
    clean_names(first = first, middle = NULL, last = last)
]

# Prepare to match: keep the cleaned name fields and demographics, and add the
# first character of the middle name plus metaphone() codes for the first and
# last names (metaphone() is defined outside this section; presumably a
# phonetic encoding -- confirm). The flag column is not carried forward.
directory_1874_match <- temp[, .(
  image_url,
  census_county,
  match_first,
  match_middle,
  match_last,
  match_first_clean,
  match_middle_init = str_sub(match_middle, 1, 1),
  first_sound = metaphone(match_first),
  last_sound = metaphone(match_last),
  birth_year = as.numeric(birth_year),
  birth_place,
  settlement_year,
  party
)]
# Merge in cleaned birth places.
directory_bp <- fread('./raw/directory_1874_birth_places.csv')
setkey(directory_1874_match, birth_place)
setkey(directory_bp, birth_place)

# A birth_place that appears more than once in the lookup would silently
# duplicate the matching directory rows in the join below, so surface it
# without stopping the script.
if (anyDuplicated(directory_bp[["birth_place"]]) > 0) {
  warning(
    "directory_1874_birth_places.csv has duplicate birth_place values; ",
    "matching directory rows will be duplicated by the merge",
    call. = FALSE
  )
}

# Keyed join: every row of directory_1874_match is kept; the lookup's columns
# are NA where a birth_place has no entry in directory_bp.
directory_1874_match <- directory_bp[directory_1874_match]


# Write the cleaned, match-ready directory table to disk as CSV.
fwrite(x = directory_1874_match, file = './cleaned/directory_to_match.csv')

# Cleanup: drop every object created by this script, i.e. everything whose
# name was not recorded in `keep` at the top (and `keep` itself is retained),
# then run the garbage collector.
rm(list = ls()[!(ls() %in% c(keep, 'keep'))])
gc()